suppressPackageStartupMessages({
  library(Matrix) # Required to work with sparse matrices
  library(dplyr)
  library(Seurat)
  library(patchwork)
  library(future) # For parallelization
  library(stringr)
  library(TxDb.Hsapiens.UCSC.hg38.knownGene)
  library(org.Hs.eg.db)
})
#Set/Create Working Directory to Folder
wd <- "/raida/ayaka.mori/integrative_dp/rna_preprocessing/preprocessing_output_simple"
dir.create(wd, showWarnings = FALSE, recursive = TRUE)
knitr::opts_knit$set(root.dir = wd)
scriptPath <- "/raida/ayaka.mori/integrative_dp/script"
source(paste0(scriptPath, "/matrix_helpers.R"))
source(paste0(scriptPath, "/plotting_config.R"))
source(paste0(scriptPath, "/seurat_helpers.R"))
source(paste0(scriptPath, "/misc_helpers.R"))
source(paste0(scriptPath, "/sample_metadata.R"))

plotDir <- paste0(wd,"/raw_qc")
dir.create(plotDir, showWarnings = FALSE, recursive = TRUE)
# Load each of the scalp datasets
raw_data_dir <- "/raida/ayaka.mori/integrative_dp/scRNA-seq_cellranger"
data_dirs <- list.dirs(path=raw_data_dir, full.names=TRUE, recursive=FALSE)
names(data_dirs) <- str_extract(data_dirs, "(?<=/)[^/]*$")

objs <- list()
for(ix in seq_along(data_dirs)){
  sample <- names(data_dirs)[ix]
  path <- data_dirs[ix]
  message(sprintf("Reading in data from sample %s...", sample))
  data <- Read10X(data.dir=path)
  obj <- CreateSeuratObject(counts=data, project=sample, min.cells=0, min.features=200) # orig.ident not getting assigned correctly?
  obj$orig.ident <- sample
  objs[[sample]] <- obj
}
## Reading in data from sample GSM6532919_AA2...
## Reading in data from sample GSM6532920_AA4...
## Reading in data from sample GSM6532921_AA7...
## Reading in data from sample GSM6532922_AA8...
## Reading in data from sample GSM6532923_C_PB1...
## Reading in data from sample GSM6532924_C_PB2...
## Reading in data from sample GSM6532925_C_PB3...
## Reading in data from sample GSM6532926_C_SD1...
## Reading in data from sample GSM6532927_C_SD2...
## Reading in data from sample GSM6532928_C_SD3...
objNames <- sapply(objs, function(x) x@project.name)
# Merge individual Seurat objects
obj <- merge(x=objs[[1]], y=objs[2:length(objs)], add.cell.ids=objNames, project="scalp")
# Identify genes we want to blacklist during clustering
rawCounts <- GetAssayData(object=obj, slot="counts")
# mitochondrial:
mt.genes <- grep(pattern = "^MT-", x = rownames(rawCounts), value = TRUE)
# Cell cycle (These are loaded by Seurat)
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes

# X/Y chromosome genes:
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
geneGR <- GenomicFeatures::genes(txdb)
##   1690 genes were dropped because they have exons located on both strands
##   of the same reference sequence or on more than one reference sequence,
##   so cannot be represented by a single genomic range.
##   Use 'single.strand.genes.only=FALSE' to get all the genes in a
##   GRangesList object, or use suppressMessages() to suppress this message.
sexGenesGR <- geneGR[seqnames(geneGR) %in% c("chrY", "chrX")]
matchedGeneSymbols <- AnnotationDbi::select(org.Hs.eg.db,
                        keys = sexGenesGR$gene_id,
                        columns = c("ENTREZID", "SYMBOL"),
                        keytype = "ENTREZID")
## 'select()' returned 1:1 mapping between keys and columns
sexChr.genes <- matchedGeneSymbols$SYMBOL


# Genes to ignore (just for clustering purposes)
blacklist.genes <- c(
    mt.genes,
    sexChr.genes,
    s.genes,
    g2m.genes
)

# Add cell cycle information now (Using Seurat):
obj <- CellCycleScoring(obj, s.features=s.genes, g2m.features=g2m.genes, set.ident=FALSE)
## Warning: The following features are not present in the object: MLF1IP, not
## searching for symbol synonyms
## Warning: The following features are not present in the object: FAM64A, HN1, not
## searching for symbol synonyms
obj[["percent.mt"]] <- PercentageFeatureSet(obj, pattern = "^MT-")
# Visualize QC metrics as a violin plot
VlnPlot(obj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, pt.size = 0.1)

#Visualize feature-feature relationships
plot1 <- FeatureScatter(obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2

#Filter cells based on QC metrics
obj <- subset(obj, subset = nFeature_RNA > 200 & nFeature_RNA < 7000 & percent.mt < 10)
#Visualize feature-feature relationships
plot1 <- FeatureScatter(obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2

#Normalizing the data
obj <- NormalizeData(obj)
#SCTransform(obj, vars.to.regress = "percent.mt", verbose = FALSE)
obj <- SCTransform(obj, vars.to.regress = "percent.mt", verbose = FALSE)
obj <- RunPCA(obj, features = VariableFeatures(object = obj))
## PC_ 1 
## Positive:  HLA-DRA, CD74, HLA-DPA1, HLA-DRB1, HLA-DPB1, HLA-DQA1, LYZ, HLA-DQB1, SRGN, CXCL8 
##     GPR183, IL1B, FTH1, G0S2, HLA-DRB5, SAT1, C15orf48, CD83, TYROBP, CPVL 
##     FCER1G, BCL2A1, RGS1, FCER1A, AIF1, PLEK, TMSB4X, SERPINB9, IL1R2, INSIG1 
## Negative:  DCN, CFD, APOD, COL1A2, MGP, PTGDS, COL1A1, COL3A1, CCDC80, CXCL14 
##     C11orf96, CFH, LUM, COL6A2, FBLN1, APOE, POSTN, TNFAIP6, COL6A1, GSN 
##     CXCL12, IGFBP5, SPARC, C1R, CTSK, MEG3, CCL2, CXCL1, C1S, SFRP2 
## PC_ 2 
## Positive:  DCN, CFD, HLA-DRA, APOD, CD74, HLA-DPA1, HLA-DRB1, HLA-DPB1, COL1A2, HLA-DQA1 
##     PTGDS, CST3, MGP, CCDC80, COL3A1, COL1A1, LYZ, C11orf96, HLA-DQB1, CFH 
##     CXCL8, FTH1, VIM, G0S2, IL1B, GSN, HLA-DRB5, LUM, GPR183, TNFAIP6 
## Negative:  KRT10, KRT14, KRT1, S100A2, DMKN, SFN, S100A8, KRT5, KRTDAP, S100A9 
##     PERP, S100A7, LY6D, KRT17, AQP3, SBSN, DSP, KRT16, S100A14, SERPINB2 
##     KRT6A, LYPD3, FABP5, FGFBP1, LGALS7B, TACSTD2, KRT15, DSC3, SERPINB5, KRT6B 
## PC_ 3 
## Positive:  DCN, CFD, KRT10, KRT1, APOD, HLA-DRA, KRT14, COL1A2, CXCL14, DMKN 
##     S100A8, HLA-DPA1, PTGDS, HLA-DPB1, S100A9, HLA-DQA1, KRTDAP, CCDC80, CST3, S100A2 
##     LYZ, COL1A1, SFN, COL3A1, HLA-DRB1, CXCL8, MGP, CD74, S100A7, KRT5 
## Negative:  SELE, ACKR1, TM4SF1, TAGLN, SPARCL1, IL6, ACTA2, STC1, C11orf96, ADAMTS1 
##     TPM2, MYL9, C2CD4B, MYH11, CSF3, ADIRF, ADAMTS9, IGFBP7, AVPR1A, AQP1 
##     IL7R, RGS5, PTPRC, MT1A, MCAM, CXCR4, FABP4, MCTP1, GJA4, PECAM1 
## PC_ 4 
## Positive:  PTPRC, IL7R, CXCR4, SRGN, CD69, CCL5, IL32, CD2, TRBC2, TSC22D3 
##     LTB, RGS1, TXNIP, TRAC, CLEC2D, ZFP36L2, TRBC1, CD52, CD3D, FYB1 
##     STK4, KLRB1, CYTIP, SPOCK2, GZMK, ARHGDIB, ALOX5AP, RHOH, DUSP4, TRAT1 
## Negative:  TM4SF1, SELE, ACKR1, IL6, SPARCL1, C11orf96, STC1, TAGLN, HLA-DRA, ADAMTS1 
##     CSF3, C2CD4B, CD74, ACTA2, IGFBP7, G0S2, HLA-DRB1, ADAMTS9, S100A8, IFI27 
##     KRT10, HLA-DPA1, TPM2, ADIRF, MYL9, CCL2, S100A9, AQP1, MYH11, KRT1 
## PC_ 5 
## Positive:  TAGLN, C11orf96, ACTA2, TPM2, MYL9, MYH11, AVPR1A, MT1A, RGS5, GJA4 
##     CYCS, CCL2, KCNE4, SORBS2, CALD1, CRISPLD2, RRAD, RERGL, MYLK, ADRA2A 
##     TPM1, ADIRF, MCAM, PLN, MT1M, RHOB, CNN1, ZNF331, PDGFA, SYNM 
## Negative:  SELE, ACKR1, TM4SF1, STC1, CSF3, C2CD4B, AQP1, IFI27, MCTP1, SPRY1 
##     FLT1, PECAM1, PLVAP, ADAMTS9, SOX17, IL6, CLDN5, DCN, ZNF385D, CALCRL 
##     CLU, ADGRL4, CFD, GNG11, TSC22D1, EMP1, EMCN, VWF, RCAN1, VCAM1
ElbowPlot(obj, ndims = 50)

# Store sample information in metadata
obj$Sample <- obj$orig.ident
# Store disease status in metadata
obj$diseaseStatus <- NA
obj$diseaseStatus <- ifelse(grepl("C_SD", obj$Sample), "C_SD", obj$diseaseStatus)
obj$diseaseStatus <- ifelse(grepl("C_PB", obj$Sample), "C_PB", obj$diseaseStatus)
obj$diseaseStatus <- ifelse(grepl("AA", obj$Sample), "AA", obj$diseaseStatus)
library(harmony)
obj <- obj %>% RunHarmony("Sample", plot_convergence = TRUE)
## Harmony 1/10
## Harmony 2/10
## Harmony 3/10
## Harmony 4/10
## Harmony 5/10
## Harmony 6/10
## Harmony 7/10
## Harmony 8/10
## Harmony 9/10
## Harmony 10/10
## Warning: Invalid name supplied, making object name syntactically valid. New
## object name is Seurat..ProjectDim.SCT.harmony; see ?make.names for more details
## on syntax validity

VlnPlot(obj, features = "PC_1", group.by = "Sample", pt.size = 0)

DimPlot(obj, reduction = "harmony", pt.size = 0.1, group.by = "Sample")

DimPlot(obj, reduction = "harmony", pt.size = 0.1, group.by = "diseaseStatus")

ElbowPlot(obj, ndims = 50, reduction = "harmony")

obj <- RunUMAP(obj, reduction = "harmony", dims = 1:20, n.neighbors = 50L, min.dist = 0.5)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 14:36:37 UMAP embedding parameters a = 0.583 b = 1.334
## 14:36:37 Read 52465 rows and found 20 numeric columns
## 14:36:37 Using Annoy for neighbor search, n_neighbors = 50
## 14:36:37 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 14:36:42 Writing NN index file to temp file /tmp/RtmpQV9Mzq/fileeb2a591db1af
## 14:36:42 Searching Annoy index using 1 thread, search_k = 5000
## 14:37:08 Annoy recall = 100%
## 14:37:09 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 50
## 14:37:12 Initializing from normalized Laplacian + noise (using irlba)
## 14:37:19 Commencing optimization for 200 epochs, with 3657800 positive edges
## 14:37:45 Optimization finished
obj <- FindNeighbors(obj, dims = 1:20)
## Computing nearest neighbor graph
## Computing SNN
obj <- FindClusters(obj, resolution = 0.8)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 52465
## Number of edges: 1666475
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9189
## Number of communities: 24
## Elapsed time: 8 seconds
# Store cluster information in metadata
obj$Clusters <- Idents(obj)
sample_colors <- c("GSM6532919_AA2" = "#CAB2D6", "GSM6532920_AA4" = "#B2DF8A", "GSM6532921_AA7" = "#FDBF6F",
   "GSM6532922_AA8" = "#A6CCC3", "GSM6532923_C_PB1" = "#E31A1C",
   "GSM6532924_C_PB2" = "#33A02C", "GSM6532925_C_PB3" = "#FB9A99", "GSM6532926_C_SD1" = "#FF7F00",
   "GSM6532927_C_SD2" = "#CCFFFF", "GSM6532928_C_SD3" = "#000000")
DimPlot(obj,reduction="umap", group.by = "Sample", cols = sample_colors)

DimPlot(obj,reduction="umap", group.by = "Clusters", label = TRUE)

sample_cmap <- readRDS("/raida/ayaka.mori/integrative_dp/script/sample_cmap.rds")
sample_cmap <- sample_cmap[names(sample_cmap) %in% unique(obj$Sample)]

disease_cmap <- head(cmaps_BOR$stallion, 3)
names(disease_cmap) <- c("AA", "C_SD", "C_PB")
# Plot clustering results:
plotDir <- paste0(wd,"/clustering_qc")
dir.create(plotDir, showWarnings = FALSE, recursive = TRUE)
plotClusterQC(obj, subgroup="preclustered", plotDir=plotDir, pointSize=0.2, sampleCmap=sample_cmap, diseaseCmap=disease_cmap)
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'xgroup'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'xgroup'. You can override using the
## `.groups` argument.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## png 
##   2
# Save clustered object here:
saveRDS(obj, file = paste0(wd, "/preclustered.rds"))
# annotation
RidgePlot(obj, features = c("KRT14", "KRT5", "KRT15", "KRT10", "SOX9")) #Keratinocytes
## Picking joint bandwidth of 0.129
## Picking joint bandwidth of 0.0875
## Picking joint bandwidth of 0.0449
## Picking joint bandwidth of 0.129
## Picking joint bandwidth of 0.00392

RidgePlot(obj, features = c("CLEC9A", "XCR1")) #Myeloid(CLEC9a.DC)
## Picking joint bandwidth of 2.14e-06
## Picking joint bandwidth of 2.11e-06

RidgePlot(obj, features = c("CLEC9A")) #Myeloid(CLEC9a.DC)
## Picking joint bandwidth of 2.14e-06

RidgePlot(obj, features = c("XCR1")) #Myeloid(CLEC9a.DC)
## Picking joint bandwidth of 2.11e-06

RidgePlot(obj, features = c("CD1A")) #Myeloid(DCs_1)
## Picking joint bandwidth of 0.00386

RidgePlot(obj, features = c("CD14", "CD163")) #Myeloid(Macs_1)
## Picking joint bandwidth of 0.00682
## Picking joint bandwidth of 0.00595

RidgePlot(obj, features = c("CD14")) #Myeloid(Macs_1)
## Picking joint bandwidth of 0.00682

RidgePlot(obj, features = c("CD163")) #Myeloid(Macs_1)
## Picking joint bandwidth of 0.00595

RidgePlot(obj, features = c("CCL19", "CD200")) #Myeloid(M1_Macs)
## Picking joint bandwidth of 0.0285
## Picking joint bandwidth of 0.0175

RidgePlot(obj, features = c("CCL19")) #Myeloid(M1_Macs)
## Picking joint bandwidth of 0.0285

RidgePlot(obj, features = c("CD200")) #Myeloid(M1_Macs)
## Picking joint bandwidth of 0.0175

RidgePlot(obj, features = c("CD79A")) #Myeloid(Plasma)
## Picking joint bandwidth of 0.0217

RidgePlot(obj, features = c("CD3D", "IKZF2", "CCL5", "CD8A"))#Tcells
## Picking joint bandwidth of 0.0168
## Picking joint bandwidth of 2.3e-06
## Picking joint bandwidth of 0.00566
## Picking joint bandwidth of 0.00591

RidgePlot(obj, features = c("KIT", "HPGD", "TPSB2")) #Mastcells
## Picking joint bandwidth of 0.0212
## Picking joint bandwidth of 0.0186
## Picking joint bandwidth of 0.014

RidgePlot(obj, features = c("THY1","COL1A1","COL11A1")) #Fibroblasts
## Picking joint bandwidth of 0.0288
## Picking joint bandwidth of 0.0489
## Picking joint bandwidth of 0.0078

RidgePlot(obj, features = c("MYL9","TPM1","TAGLN")) #Muscle
## Picking joint bandwidth of 0.0353
## Picking joint bandwidth of 0.0515
## Picking joint bandwidth of 0.028

RidgePlot(obj, features = c("SELE","VWF","PECAM1","CD200")) #Endothelial
## Picking joint bandwidth of 0.0161
## Picking joint bandwidth of 0.0188
## Picking joint bandwidth of 0.028
## Picking joint bandwidth of 0.0175

RidgePlot(obj, features = c("LYVE1","FLT4","VWF","PECAM1","CD200")) #Lymphatic
## Picking joint bandwidth of 0.00953
## Picking joint bandwidth of 0.00508
## Picking joint bandwidth of 0.0188
## Picking joint bandwidth of 0.028
## Picking joint bandwidth of 0.0175

RidgePlot(obj, features = c("MLANA", "MITF","SOX10", "KIT")) #Melanocytes
## Picking joint bandwidth of 0.0116
## Picking joint bandwidth of 0.0191
## Picking joint bandwidth of 0.00588
## Picking joint bandwidth of 0.0212

RidgePlot(obj, features = c("SOX10", "ITGB8")) #McSc
## Picking joint bandwidth of 0.00588
## Picking joint bandwidth of 0.0125

RidgePlot(obj, features = c("KLRK1")) #NKG2D+かどうかの確認
## Picking joint bandwidth of 2.1e-06

obj_tcell <- subset(obj, subset = Clusters %in% c("0","1","11")) #Tcells
obj_fib <- subset(obj, subset = Clusters %in% c("2","3","10")) #Fibroblasts
obj_mye <- subset(obj, subset = Clusters %in% c("4","12","22","23")) #Myeloid
obj_end <- subset(obj, subset = Clusters %in% c("5","16")) #Endothelial
obj_kera <- subset(obj, subset = Clusters %in% c("6","7","8","13","15","18")) #Keratinocytes
obj_mus <- subset(obj, subset = Clusters %in% c("9","14")) #Muscle
obj_mast <- subset(obj, subset = Clusters == "17") #Mastcells
obj_mela <- subset(obj, subset = Clusters == "20") #Melanocytes
obj_lym <- subset(obj, subset = Clusters == "21") #Lymphatic
obj_other <- subset(obj, subset = Clusters == "19") #Other
obj.meta <- obj@meta.data
obj.meta$clust_chat <- c("")
obj.meta[colnames(obj_tcell), ] $clust_chat <- "Tcells"
obj.meta[colnames(obj_fib), ] $clust_chat <- "Fibroblasts"
obj.meta[colnames(obj_mye), ] $clust_chat <- "Myeloid"
obj.meta[colnames(obj_end), ] $clust_chat <- "Endothelial"
obj.meta[colnames(obj_kera), ] $clust_chat <- "Keratinocytes"
obj.meta[colnames(obj_mus), ] $clust_chat <- "Muscle"
obj.meta[colnames(obj_mast), ] $clust_chat <- "Mastcells"
obj.meta[colnames(obj_mela), ] $clust_chat <- "Melanocytes"
obj.meta[colnames(obj_lym), ] $clust_chat <- "Lymphatic"
obj.meta[colnames(obj_other), ] $clust_chat <- "Other"
obj.meta.use <- c(colnames(obj_tcell),colnames(obj_fib),colnames(obj_mye),colnames(obj_end),colnames(obj_kera),colnames(obj_mus),colnames(obj_mast),colnames(obj_mela),colnames(obj_lym),colnames(obj_other))
obj <- obj[, obj.meta.use]
obj.meta <- obj.meta[obj.meta.use, ]
obj@meta.data <- obj.meta
saveRDS(obj, file = paste0(wd, "/scalp.rds"))